In [62]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import progressbar

Importing URLs to visit


In [84]:
df = pd.read_csv('allPosts.csv', sep='\t', encoding='utf-16')

In [85]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21414 entries, 0 to 21413
Data columns (total 9 columns):
Unnamed: 0    21414 non-null int64
Date_1        21414 non-null int64
Date_2        21414 non-null int64
Date_3        21414 non-null object
ID_page       21414 non-null int64
ID_post       21414 non-null int64
Link          21414 non-null object
Title         21414 non-null object
Txt           18882 non-null object
dtypes: int64(5), object(4)
memory usage: 1.5+ MB

In [88]:
urls = list(df['Link'])
titles = list(df['Title'])

Visit on URLs


In [57]:
urls[1000]


Out[57]:
'https://daphnecaruanagalizia.com/2015/10/erin-tanti-pulls-out-of-university/'

In [58]:
url = urls[1000]
response = requests.get(url)
t = BeautifulSoup(response.text, 'html.parser')

In [59]:
t.find_all('p')


Out[59]:
[<p style="width: 130px;float: left;text-align: left; margin:-4px 0 0 -5px;">
 <a href="/">« back to home</a>
 </p>,
 <p class="no-break"><span class="st_facebook_hcount" st_title="Erin Tanti pulls out of university" st_url="https://daphnecaruanagalizia.com/2015/10/erin-tanti-pulls-out-of-university/"></span> <span class="st_twitter_hcount" st_title="Erin Tanti pulls out of university" st_url="https://daphnecaruanagalizia.com/2015/10/erin-tanti-pulls-out-of-university/" st_username="dcgblog" st_via="dcgblog"></span> <span class="st_linkedin_hcount" st_title="Erin Tanti pulls out of university" st_url="https://daphnecaruanagalizia.com/2015/10/erin-tanti-pulls-out-of-university/"></span></p>,
 <p><a href="https://daphnecaruanagalizia.com/wp-content/uploads/2014/03/erin_stuart_tanti_1.jpg.png"><img alt="erin_stuart_tanti_1.jpg" class="aligncenter size-full wp-image-45077" height="400" sizes="(max-width: 684px) 100vw, 684px" src="https://daphnecaruanagalizia.com/wp-content/uploads/2014/03/erin_stuart_tanti_1.jpg.png" srcset="https://daphnecaruanagalizia.com/wp-content/uploads/2014/03/erin_stuart_tanti_1.jpg.png 684w, https://daphnecaruanagalizia.com/wp-content/uploads/2014/03/erin_stuart_tanti_1.jpg-300x175.png 300w" width="684"/></a></p>,
 <p>I am informed that Erin Tanti, who is to stand trial for the murder of one of his 15-year-old pupils, but who is out on bail, has pulled out of university.</p>,
 <p>Tanti had enrolled at the University of Malta to begin reading for a bachelor’s degree in psychology and anthropology. He began going to lectures this month. A controversy erupted following a report on this website, which led to the Attorney General requesting the court to appoint three psychiatrists to assess whether it is safe to allow him in that environment, for the safety of others and his own.</p>,
 <p><span class="st_facebook_hcount" st_title="Erin Tanti pulls out of university" st_url="https://daphnecaruanagalizia.com/2015/10/erin-tanti-pulls-out-of-university/"></span> <span class="st_twitter_hcount" st_title="Erin Tanti pulls out of university" st_url="https://daphnecaruanagalizia.com/2015/10/erin-tanti-pulls-out-of-university/" st_username="dcgblog" st_via="dcgblog"></span> <span class="st_linkedin_hcount" st_title="Erin Tanti pulls out of university" st_url="https://daphnecaruanagalizia.com/2015/10/erin-tanti-pulls-out-of-university/"></span></p>,
 <p style="width: 45%;float: left;text-align: left;"><a href="https://daphnecaruanagalizia.com/2015/10/the-department-of-information-gets-its-chrises-confused/" rel="prev">« previous post</a></p>,
 <p style="width: 45%;float: right; text-align: right;"><a href="https://daphnecaruanagalizia.com/2015/10/man-in-staged-electoral-campaign-visit-put-on-state-payroll-at-transport-malta/" rel="next">next post »</a></p>]

In [60]:
at = ''
for elem in t.find_all('p'):
    at += elem.text

In [61]:
at


Out[61]:
'\n« back to home\n  I am informed that Erin Tanti, who is to stand trial for the murder of one of his 15-year-old pupils, but who is out on bail, has pulled out of university.Tanti had enrolled at the University of Malta to begin reading for a bachelor’s degree in psychology and anthropology. He began going to lectures this month. A controversy erupted following a report on this website, which led to the Attorney General requesting the court to appoint three psychiatrists to assess whether it is safe to allow him in that environment, for the safety of others and his own.  « previous postnext post »'

Visiting all URLs


In [89]:
bar = progressbar.ProgressBar()

text_list = []
for url,i in zip(urls[:100], bar(range(len(urls[:100])))):
    response = requests.get(url)
    t = BeautifulSoup(response.text, 'html.parser') 
    at = ''
    for elem in t.find_all('p'):
        at += elem.text
    text_list.append({'Text':at})


 99% (99 of 100) |######################## | Elapsed Time: 0:01:41 ETA: 0:00:01

In [90]:
txt_lst = list(pd.DataFrame(text_list)['Text'])

Connecting all posts


In [93]:
df_t = df[:100]

In [94]:
df_t['Text'] = txt_lst


/Users/barneyjs/.virtualenvs/master/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.

In [96]:
df_t.to_csv('allPostText_test.csv')

In [ ]: